{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <font color='green'>20newscroups - LDA<font>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 1. Description<font>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Topic analysis of documents using LDA.\n",
    "\n",
    "It uses 20 newsgroup dataset that can be taken from scikit-learn function. LDA is applied to this dataset for topic analysis. \n",
    "\n",
    "The algorithm extracts the topics of the document. The program shows the words that belongs to the topics to see if the topic analysis is\n",
    "working properly."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 2. Data Preprocessing<font>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# prepare data\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import numpy as np\n",
    "\n",
    "dataset = fetch_20newsgroups(subset='all',\n",
    "                             remove=('headers', 'footers', 'quotes'),\n",
    "                             shuffle=True, random_state=42)\n",
    "\n",
    "vectorizer = CountVectorizer(min_df=5, stop_words='english')\n",
    "vec = vectorizer.fit(dataset.data)\n",
    "X = vec.transform(dataset.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 3. Implementation using Frovedis<font>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train time: 10.070 sec\n"
     ]
    }
   ],
   "source": [
    "# train\n",
    "import os, time\n",
    "from frovedis.decomposition import LatentDirichletAllocation as frovLatentDirichletAllocation \n",
    "from frovedis.exrpc.server import FrovedisServer\n",
    "FrovedisServer.initialize(\"mpirun -ve 2 -np 8 {}\".format(os.environ['FROVEDIS_SERVER']))\n",
    "\n",
    "n_components=20\n",
    "frov_lda = frovLatentDirichletAllocation(n_components=n_components, max_iter=100)\n",
    "t1 = time.time()\n",
    "frov_lda.fit(X)\n",
    "t2 = time.time()\n",
    "print (\"train time: {:.3f} sec\".format(t2-t1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "topic 0: ['db', 'cx', 'w7', 'ah', 'bh', 'mv', 'c_', 'lk', 'uw', 'chz']\n",
      "topic 1: ['00', '50', '10', 'new', '1st', '20', 'adl', 'appears', 'man', 'art']\n",
      "topic 2: ['use', 'just', 'like', 'power', 'used', 'don', 'time', 'water', 'good', 'know']\n",
      "topic 3: ['don', 'just', 'like', 'think', 'people', 'know', 'good', 'say', 'really', 've']\n",
      "topic 4: ['file', 'image', 'files', 'jpeg', 'mail', 'ftp', 'use', 'format', 'gif', 'list']\n",
      "topic 5: ['armenian', 'israel', 'jews', 'turkish', 'people', 'armenians', 'war', 'israeli', 'jewish', 'government']\n",
      "topic 6: ['god', 'jesus', 'people', 'bible', 'does', 'christian', 'church', 'believe', 'christ', 'christians']\n",
      "topic 7: ['key', 'encryption', 'chip', 'use', 'clipper', 'keys', 'government', 'security', 'privacy', 'public']\n",
      "topic 8: ['said', 'people', 'went', 'know', 'didn', 'time', 'just', 'did', 'don', 'like']\n",
      "topic 9: ['health', 'medical', 'disease', 'use', 'cancer', 'patients', 'drug', 'study', 'number', 'hiv']\n",
      "topic 10: ['drive', 'like', 'new', 'thanks', 'hard', 'apple', 'price', 'drives', 'disk', 'mac']\n",
      "topic 11: ['windows', 'dos', 'card', 'use', 'problem', 'pc', 'using', 'does', 'software', 'memory']\n",
      "topic 12: ['space', 'nasa', 'data', 'earth', 'launch', 'program', 'research', 'science', 'shuttle', 'center']\n",
      "topic 13: ['edu', 'com', 'university', 'mail', 'ca', 'send', 'cs', '1993', 'fax', 'information']\n",
      "topic 14: ['window', 'file', 'use', 'available', 'server', 'program', 'edu', 'ftp', 'graphics', 'image']\n",
      "topic 15: ['game', 'team', 'year', 'games', 'play', 'season', 'hockey', 'players', 'league', 'win']\n",
      "topic 16: ['ax', 'max', 'g9v', 'b8f', 'a86', 'pl', '145', '1d9', '0t', '1t']\n",
      "topic 17: ['10', '25', '12', '11', '16', '15', '14', '20', '13', '18']\n",
      "topic 18: ['car', 'like', 'bike', 'just', 'new', 'engine', 'good', 'cars', 'don', 'know']\n",
      "topic 19: ['people', 'government', 'gun', 'president', 'think', 'mr', 'right', 'don', 'law', 'state']\n"
     ]
    }
   ],
   "source": [
    "# predict\n",
    "feature_names = vec.get_feature_names()\n",
    "sorted = np.argsort(frov_lda.components_, axis=1)[:, ::-1]\n",
    "num_words = 10\n",
    "sorted_head = sorted[:,0:num_words]\n",
    "for i in range(n_components):\n",
    "    to_print = []\n",
    "    for j in range(num_words):\n",
    "        to_print.append(feature_names[sorted_head[i,j]])\n",
    "    print (\"topic {0}: {1}\".format(i, to_print))\n",
    "\n",
    "FrovedisServer.shut_down()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 4. Implementation using scikit-learn<font>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train time: 533.943 sec\n"
     ]
    }
   ],
   "source": [
    "# train\n",
    "import os, time\n",
    "from sklearn.decomposition import LatentDirichletAllocation as skLatentDirichletAllocation\n",
    "\n",
    "n_components=20\n",
    "sk_lda = skLatentDirichletAllocation(n_components=n_components, max_iter=100)\n",
    "t1 = time.time()\n",
    "sk_lda.fit(X)\n",
    "t2 = time.time()\n",
    "print (\"train time: {:.3f} sec\".format(t2-t1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "topic 0: ['medical', 'health', 'disease', 'cancer', 'patients', 'use', 'drug', '1993', 'hiv', 'treatment']\n",
      "topic 1: ['edu', 'ftp', 'com', 'available', 'mail', 'server', 'graphics', 'pub', 'list', 'software']\n",
      "topic 2: ['car', 'just', 'like', 'good', 'power', 'don', 'used', 'use', 'new', 'bike']\n",
      "topic 3: ['db', 'cx', 'w7', 'ah', 'hz', 'bh', 'mv', 'c_', 'uw', 't7']\n",
      "topic 4: ['edu', 'com', 'ms', 'ed', 'myers', 'cs', 'university', 've', 'david', 'banks']\n",
      "topic 5: ['space', 'nasa', 'earth', 'research', 'data', 'launch', 'program', 'science', 'center', 'university']\n",
      "topic 6: ['know', 'thanks', 'like', 'does', 'just', 'don', 'problem', 've', 'help', 'mail']\n",
      "topic 7: ['gun', 'guns', 'states', 'file', 'state', 'control', 'crime', 'firearms', 'weapons', 'law']\n",
      "topic 8: ['drive', 'dos', 'card', 'disk', 'scsi', 'windows', 'pc', 'mac', 'software', 'video']\n",
      "topic 9: ['government', 'key', 'use', 'law', 'people', 'encryption', 'public', 'chip', 'security', 'clipper']\n",
      "topic 10: ['armenian', 'homosexual', 'history', 'men', 'genocide', 'sex', 'turkish', 'university', 'homosexuality', 'muslim']\n",
      "topic 11: ['water', 'dog', 'red', 'kuwait', 'heat', 'nuclear', 'pope', 'ra', 'blue', 'south']\n",
      "topic 12: ['game', 'team', 'year', 'games', 'season', 'play', 'players', 'hockey', 'good', 'league']\n",
      "topic 13: ['file', 'use', 'image', 'program', 'windows', 'window', 'files', 'jpeg', 'display', 'color']\n",
      "topic 14: ['don', 'people', 'think', 'know', 'just', 'said', 'like', 'going', 'did', 'time']\n",
      "topic 15: ['ax', 'max', 'g9v', 'b8f', 'a86', 'pl', '145', '1d9', '0t', '1t']\n",
      "topic 16: ['00', '50', '10', '03', '02', '04', '1st', '40', '05', '01']\n",
      "topic 17: ['10', '25', '12', '11', '15', '20', '14', '16', '13', '17']\n",
      "topic 18: ['israel', 'armenian', 'armenians', 'jews', 'war', 'people', 'israeli', 'said', 'jewish', 'turkish']\n",
      "topic 19: ['god', 'people', 'does', 'jesus', 'think', 'say', 'believe', 'don', 'just', 'know']\n"
     ]
    }
   ],
   "source": [
    "# predict\n",
    "feature_names = vec.get_feature_names()\n",
    "sorted = np.argsort(sk_lda.components_, axis=1)[:, ::-1]\n",
    "num_words = 10\n",
    "sorted_head = sorted[:,0:num_words]\n",
    "for i in range(n_components):\n",
    "    to_print = []\n",
    "    for j in range(num_words):\n",
    "        to_print.append(feature_names[sorted_head[i,j]])\n",
    "    print (\"topic {0}: {1}\".format(i, to_print))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
